R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(fpp2)
## Loading required package: forecast
## Loading required package: fma
## Loading required package: expsmooth
library(xray)
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.5.2
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
dataset <- read.csv('TT2/data2.csv')
summary(dataset)
##  Financial_Year  Branch_Code       Sequence_1       Sequence_2   
##  Min.   :14.00   BRC-01:193503   Min.   :   345   Min.   :    1  
##  1st Qu.:15.00   BRC-02: 25126   1st Qu.: 11289   1st Qu.: 7572  
##  Median :16.00   BRC-03:  1251   Median : 22564   Median :16388  
##  Mean   :15.87   BRC-05:   239   Mean   : 34540   Mean   :17716  
##  3rd Qu.:17.00   BRC-06:  1763   3rd Qu.: 35659   3rd Qu.:27430  
##  Max.   :18.00   BRC-07:  3616   Max.   :122099   Max.   :89061  
##                                                   NA's   :2      
##  Donation_type      Donor_Age         Donation_Date    Gender    
##  R      :119386   Min.   : -60.04   26-SEP-13:   281    :    14  
##  T      : 64420   1st Qu.:  23.50   22-AUG-13:   269   F:   129  
##  M      : 19709   Median :  28.00   19-SEP-13:   266   M:225355  
##  N      : 15304   Mean   :  29.40   28-AUG-13:   253             
##  P      :  6598   3rd Qu.:  33.00   23-AUG-13:   251             
##  A      :    67   Max.   :2002.34   06-OCT-13:   249             
##  (Other):    14   NA's   :16455     (Other)  :223929             
##  Blood_Group_Code  Donor_Weight    Donor_Temperature  Donor_Pulse    
##  Min.   : 0.0     Min.   :-80.00   Min.   :  0.30    Min.   :  0.00  
##  1st Qu.: 3.0     1st Qu.: 65.00   1st Qu.: 37.00    1st Qu.: 72.00  
##  Median : 3.0     Median : 72.00   Median : 37.00    Median : 72.00  
##  Mean   : 3.6     Mean   : 73.73   Mean   : 38.02    Mean   : 74.57  
##  3rd Qu.: 5.0     3rd Qu.: 80.00   3rd Qu.: 37.00    3rd Qu.: 72.00  
##  Max.   :17.0     Max.   :980.00   Max.   :999.00    Max.   :872.00  
##  NA's   :539      NA's   :32827    NA's   :32830     NA's   :32829   
##  Donor_Hemoglobin Donor_Blood_Pressure     Test_1          C1        
##  Min.   :  0.13   120/80 :179530       Min.   :   0.0000    :  2498  
##  1st Qu.: 13.00          : 32831       1st Qu.:   0.1230   I:   146  
##  Median : 13.00   120/70 :  2092       Median :   0.1830   N:222846  
##  Mean   : 13.34   80/120 :   805       Mean   :   0.3376   R:     8  
##  3rd Qu.: 13.00   120/90 :   780       3rd Qu.:   0.2940             
##  Max.   :714.00   120//80:   773       Max.   :1183.0220             
##  NA's   :32830    (Other):  8687       NA's   :2563                  
##      Test_2         C2         Test_3     Test_4    
##  Min.   :   0.001    :  2614    :   262    :   660  
##  1st Qu.:   0.183   I:  3761   N:221563   N:224836  
##  Median :   0.222   N:219068   P:   293   P:     1  
##  Mean   :  47.756   P:     2   R:  3380   R:     1  
##  3rd Qu.:   0.264   R:    53                        
##  Max.   :8071.204                                   
##  NA's   :2646
#  extracting the main columns from dataset
# for time series analysis

time_data <- dataset %>%
  select(Branch_Code,Donation_type,Donation_Date)
head(time_data)
##   Branch_Code Donation_type Donation_Date
## 1      BRC-01             R     01-JUL-13
## 2      BRC-01             R     01-JUL-13
## 3      BRC-01             R     01-JUL-13
## 4      BRC-01             R     01-JUL-13
## 5      BRC-01             T     01-JUL-13
## 6      BRC-01             T     01-JUL-13
summary(time_data)
##  Branch_Code     Donation_type      Donation_Date   
##  BRC-01:193503   R      :119386   26-SEP-13:   281  
##  BRC-02: 25126   T      : 64420   22-AUG-13:   269  
##  BRC-03:  1251   M      : 19709   19-SEP-13:   266  
##  BRC-05:   239   N      : 15304   28-AUG-13:   253  
##  BRC-06:  1763   P      :  6598   23-AUG-13:   251  
##  BRC-07:  3616   A      :    67   06-OCT-13:   249  
##                  (Other):    14   (Other)  :223929
# converting date column from dataset

time_data$Donation_Date <- as.Date(time_data$Donation_Date, format = "%d-%b-%y")
head(time_data)
##   Branch_Code Donation_type Donation_Date
## 1      BRC-01             R    2013-07-01
## 2      BRC-01             R    2013-07-01
## 3      BRC-01             R    2013-07-01
## 4      BRC-01             R    2013-07-01
## 5      BRC-01             T    2013-07-01
## 6      BRC-01             T    2013-07-01
#  remove the missing values from data 
#  ther is almost 3~4 values are missing in selected data

time_data <- na.omit(time_data)

# check min and max date from dataset

min(time_data$Donation_Date)
## [1] "1974-04-04"
max(time_data$Donation_Date)
## [1] "2018-07-06"
#  Starting date from the finincial_year which is July 1st

time_data <- time_data %>%
  filter(Donation_Date >= "2013-07-01")


# select data of specific branch 
# because there are different branch in dataset 
# and different count of people goto different branch 
# we select 1st branch

branch_data_fun <-  function(branch){
  brac_data <- time_data %>%
    filter(Branch_Code %in% branch)
}


# call above function that subset data on Branch column

branch_data <- branch_data_fun("BRC-01")

head(branch_data)
##   Branch_Code Donation_type Donation_Date
## 1      BRC-01             R    2013-07-01
## 2      BRC-01             R    2013-07-01
## 3      BRC-01             R    2013-07-01
## 4      BRC-01             R    2013-07-01
## 5      BRC-01             T    2013-07-01
## 6      BRC-01             T    2013-07-01
# creating a data.frame that contain just the date and no donor that 
# came to specific branch for donation

agg_brac_date <-aggregate(x = branch_data[,3],
  by = list(Date = branch_data$Donation_Date), 
  FUN = length)
head(agg_brac_date)
##         Date   x
## 1 2013-07-01  98
## 2 2013-07-02 135
## 3 2013-07-03 121
## 4 2013-07-04 113
## 5 2013-07-05 122
## 6 2013-07-06 102
#  checking if there is any missing date from the 
# sequence which is not included in above data frame

alldates = seq(min(agg_brac_date$Date), max(agg_brac_date$Date), 1)


#  checking if there is some date that is not included in above data frame

dates0 = alldates[!(alldates %in% agg_brac_date$Date)]

# creating a data.frame of missing dates  

data0 <- data.frame()
if(length(dates0)!=0){
  data0 <- data.frame(Date = dates0,x = 0)   
}


# bind the missing date data to orignal data in data.frame
# it contains the date and no of donor came on that date

full_data_branch = rbind(agg_brac_date, data0)
# Order the data by date column
full_data_branch = full_data_branch[order(full_data_branch$Date),]
head(full_data_branch)
##         Date   x
## 1 2013-07-01  98
## 2 2013-07-02 135
## 3 2013-07-03 121
## 4 2013-07-04 113
## 5 2013-07-05 122
## 6 2013-07-06 102
# printing the outliers 

outliers <- boxplot.stats(full_data_branch$x)$out
outliers
##  [1] 184 211 228 203 189 195 189 190 207 235 202 183 233 184 184 203 183
## [18] 191 197 183 221 195 223 192 195 191 184  26   0   2   1   1   1   1
#  ploting the boxplot to check the outliers
hcboxplot(x = full_data_branch$x) %>%
  hc_chart(type = "column")
# calculating the mean and median of data 
# then find the average of both of them 
# and replace it with the outliers

meann <- mean(full_data_branch$x)+median(full_data_branch$x)
meann <- meann/2
meann
## [1] 104.3114
# data without outliers
xx <- full_data_branch %>%
  filter(!full_data_branch$x %in% c(outliers))

#  data with outliers
yy <- full_data_branch %>%
  filter(full_data_branch$x %in% outliers)

# replace outlier with the average of mean and median
yy$x <- meann

# append both xx,yy dataframe
# bind the clean and smooth data

full_data_branch <- rbind(xx,yy)

# Data after smoothing 

head(full_data_branch,10)
##          Date   x
## 1  2013-07-01  98
## 2  2013-07-02 135
## 3  2013-07-03 121
## 4  2013-07-04 113
## 5  2013-07-05 122
## 6  2013-07-06 102
## 7  2013-07-07  99
## 8  2013-07-08 143
## 9  2013-07-09 124
## 10 2013-07-10 156
my_ts_br <- ts(full_data_branch$x , start = c(2013,7), frequency = 365)

# Summary of data

summary(my_ts_br)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    27.0    86.0   103.0   104.6   122.0   181.0
# head of data

head(my_ts_br,20)
## Time Series:
## Start = c(2013, 7) 
## End = c(2013, 26) 
## Frequency = 365 
##  [1]  98 135 121 113 122 102  99 143 124 156  93 110  97  86 120 100 127
## [18] 146 144 110
# Dividing the dataset in Train and Test Data
# --------
train <- window(
  my_ts_br,
  end=c(2017,5))

test <- window(
  my_ts_br,
  start=c(2017,6))

# Head of train data 

head(train)
## Time Series:
## Start = c(2013, 7) 
## End = c(2013, 12) 
## Frequency = 365 
## [1]  98 135 121 113 122 102
# Head of test data 

head(test)
## Time Series:
## Start = c(2017, 6) 
## End = c(2017, 11) 
## Frequency = 365 
## [1] 120 117 131  96 112 120
# Decomposing time series object to Analyze dataset

decomp <- stl(log(my_ts_br), "per")

summary(decomp)
##  Call:
##  stl(x = log(my_ts_br), s.window = "per")
## 
##  Time.series components:
##     seasonal              trend            remainder         
##  Min.   :-0.3898126   Min.   :4.508096   Min.   :-1.1864296  
##  1st Qu.:-0.0876822   1st Qu.:4.559432   1st Qu.:-0.1258090  
##  Median : 0.0025861   Median :4.609144   Median : 0.0178137  
##  Mean   : 0.0002303   Mean   :4.616294   Mean   :-0.0016538  
##  3rd Qu.: 0.0943307   3rd Qu.:4.661327   3rd Qu.: 0.1476118  
##  Max.   : 0.3099108   Max.   :4.790346   Max.   : 0.6414138  
##  IQR:
##      STL.seasonal STL.trend STL.remainder data  
##      0.1820       0.1019    0.2734        0.3497
##    %  52.1         29.1      78.2         100.0 
## 
##  Weights: all == 1
## 
##  Other components: List of 5
##  $ win  : Named num [1:3] 18321 549 365
##  $ deg  : Named int [1:3] 0 1 1
##  $ jump : Named num [1:3] 1833 55 37
##  $ inner: int 2
##  $ outer: int 0
# Look at trend, season and remainder to observe data

ggplotly(autoplot(decomp))
# Plot the time series data

plot_ly(x = ~full_data_branch\(Date, y = ~full_data_branch\)x, mode = ‘lines’) %>% layout( title = " Sales“, xaxis = list( title =”Date“, rangeselector = list( buttons = list( list( count = 3, label =”3 mo“, step =”month“, stepmode =”backward“), list( count = 6, label =”6 mo“, step =”month“, stepmode =”backward“), list( count = 1, label =”1 yr“, step =”year“, stepmode =”backward“), list( count = 1, label =”YTD“, step =”year“, stepmode =”todate“), list(step =”all"))),

  rangeslider = list(type = "date")),
yaxis = list(title = "Count"))
# It will use to plot the data over different years

ggplotly(ggseasonplot(my_ts_br,polar = F))
# These are some functions to calculata accuracy
# -----------------------------
# Mean Square Error

MSE <- function(y,yhat)
{
  mean((y-yhat)**2)
}

# Mean absolute Error
MAE <- function(y,yhat)
{
  mean(abs(y-yhat))
}

# Mean absolute precentage error
MAPE <- function(y,yhat,percent=TRUE)
{
  if(percent){
    100*mean(abs( (y-yhat)/y ))
  } else {
    mean(abs( (y-yhat)/y ))
  }
}

MAE <- function(y,yhat)
{
  mean(abs(y-yhat))
}

# Mean absolute precentage error
MAPE <- function(y,yhat,percent=TRUE)
{
  if(percent){
    100*mean(abs( (y-yhat)/y ))
  } else {
    mean(abs( (y-yhat)/y ))
  }
}

# create models_accuracy dataframe

models_accuracy <- data.frame(modelname = as.character(),mse=as.character(),mae=as.character(),mape=as.character())




# This is our 2nd model
# Which is Stlf model
stlf.fit <- stlf(train,h=length(test),robust = T,lambda = "auto")

yHat <- stlf.fit$mean

mse <- MSE(test,yHat)

mae <- MAE(test,yHat)

mape <- MAPE(test,yHat)


models_accuracy <- data.frame(model.name="STLF",mse = mse, mae = mae, mape = mape)



ggplotly(autoplot(stlf.fit))
# simple Forcasting 
forecas.fit <- forecast(train,h = length(test))

yHat <- forecas.fit$mean

mse <- MSE(test,yHat)

mae <- MAE(test,yHat)

mape <- MAPE(test,yHat)

models_accuracy <- rbind(models_accuracy,data.frame(model.name = "simple forecas",mse=mse , mae = mae,mape = mape))

ggplotly(autoplot(forecas.fit))
#  This will predict using neural network
nn.fit <- forecast(nnetar(train, lambda=0.9,),h=length(test))

yHat <- nn.fit$mean

mse <- MSE(test,yHat)

mae <- MAE(test,yHat)

mape <- MAPE(test,yHat)


ggplotly(autoplot(nn.fit))
models_accuracy <- rbind(models_accuracy,data.frame(model.name = "Neural network",mse=mse , mae = mae,mape = mape))

#  Accuracy of different models are

models_accuracy
##       model.name      mse      mae     mape
## 1           STLF 727.3467 21.52041 27.00349
## 2 simple forecas 799.8863 22.38884 29.00822
## 3 Neural network 542.6518 18.54749 20.78775